******************************************************************
* This script processes loan-level repayment data and estimates the 
* Internal Rate of Return (IRR). It cleans the dataset, computes key 
* financial ratios, and simulates the cash flow profile of loans.
* The final IRR computation is exported for processing in Excel.
******************************************************************

clear 

* Import data from CSV to Stata format
import delimited "$input/profitability_anonym.csv", clear

* Fix column name typo
rename amoutn amount

* Count the number of loans in the dataset
count

* Summary statistics of loan maturity
su maturity, d

* Compute loan performance ratios
gen ratio_duration = observed / maturity  // Ratio of observed duration to maturity
gen ratio_due = capital / amount          // Ratio of capital due to total amount

* Summary statistics for computed ratios
su ratio*, d

* Compute annualized origination and administrative costs
gen orig_annual = (1.03)^(1/(round(maturity/12))) - 1
gen admin_annual = (1.0004)^12 - 1

* Adjusted interest rate after fees
gen rate_adj = max(rate - orig_annual*100 - admin_annual*100, 0)
gen total_fees = admin_annual + orig_annual

* Identify loans not fully repaid
gen not_full = ratio_duration < 0.95 & ratio_due > 0.05 

* Loans fully repaid -> Assume IRR equals rate_adj
preserve
    keep if not_full == 0
    gen default = 0
    gen charge_off = 0
    gen irr = rate_adj
    keep default charge_off irr
    save "$output/profitability_output.dta", replace
restore 

* Keep only loans that are not fully repaid
keep if not_full == 1
drop v1

* Generate unique loan IDs
gen id = _n

* Compute number of observed payment periods
gen observed_round = floor(observed)
gen nb_period = observed_round + 2  // One period before (time 0) and one terminal period

* Check the maximum number of periods
su nb_period, d

* Expand data to simulate cash flows
expand `r(max)'
bys id: gen period = _n - 1

* Monthly interest rate
gen monthly = (1 + rate_adj / 100)^(1/12) - 1

* Compute fixed repayment amount per period using annuity formula
gen repayment_fixed = amount * monthly / (1 - (1 / (1 + monthly))^maturity)

* No repayment at the initial period (time 0)
replace repayment_fixed = 0 if period == 0

* Set panel structure
xtset id period

* Compute remaining capital and interest for each period
gen restant_du = amount
gen interest = 0

su nb_period, d
forvalues i = 1(1) `r(max)' {
    replace restant_du = max((L.restant_du - repayment_fixed + L.interest), 0) if period == `i' 
    replace interest = restant_du * monthly if period == `i'
}

* Keep only observed payment periods
keep if period <= observed_round + 1

* Compute final remaining balance
bys id: gen restant_final = restant[_N]

* Count loans that are fully repaid
distinct id if restant_final <= 0

* Check loans initially marked as default but actually repaid
distinct id if restant_final <= 0 & default == 1
replace default = 0 if restant_final <= 0

* Process fully repaid loans
preserve
    keep if restant_final <= 0 
    keep if period == 1
    gen default = 0
    gen charge_off = 0
    gen irr = rate_adj
    keep default charge_off irr
    append using "$output/profitability_output.dta"
    sleep 2000
    save "$output/profitability_output.dta", replace
restore 

* Process loans with remaining balance
keep if restant_final > 0 

* Identify early repayers
distinct id if default == 0 
distinct id if default == 0 & observed_round <= 6 

* Terminal payment adjustments
replace repayment_f = 0 if default == 1 & (observed_round + 1 == period)
replace repayment_f = L.restant_du if default == 0 & (observed_round + 1 == period)

* Cash flow adjustment for IRR computation
replace repayment_f = -amount if period == 0

* Sort data before IRR export
sort id period
br id period repayment_f restant_du

* Export cash flow data for IRR calculation in Excel
preserve 
    keep id period repayment_f 
    reshape wide repayment_f, i(period) j(id)
    foreach var of varlist repayment* {
        replace `var' = 0 if `var' == .
    }
    export delimited using "$output/irr_computation.csv", replace
restore

* Compute charge-offs
gen charge_off = 0 if default == 0 & (observed_round + 1 == period)
replace charge_off = L.restant_du / amount if default == 1 & (observed_round + 1 == period)

* Keep only the final period for default analysis
keep if observed_round + 1 == period
gen default = defaulting
keep id default charge_off
save "$output/profitability_temp.dta", replace

*********************************************************************************

* Import IRR results from Excel
import delimited "$output/irr_output.csv", clear
reshape long repayment_fixed, i(period) j(id)
drop period
rename repayment_fixed irr

* Merge IRR results with default and charge-off data
merge 1:1 id using "$output/profitability_temp.dta"
keep if _merge == 3
drop _merge
drop id

* Append new IRR results to existing dataset
append using "$output/profitability_output.dta"
sleep 2000
save "$output/profitability_output.dta", replace

* Summary statistics
su irr default, d
su charge_off if default, d
